{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19lIv8MOT1gY6vd3fWU4Mr4ciZMhXI44t?usp=sharing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "HTNVpcYyUmOF"
   },
   "outputs": [],
   "source": [
    "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
    "# requests is imported directly by this notebook, so it is listed explicitly.\n",
    "%pip install pandas scrapy wikipedia requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from scrapy import Selector\n",
    "from urllib.parse import urljoin\n",
    "import re\n",
    "import pandas as pd\n",
    "from random import choice\n",
    "import wikipedia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "background_save": true
    },
    "id": "Mmb6GNTzUssD"
   },
   "outputs": [],
   "source": [
    "def number_filtering(texto: \"str | None\") -> str:\n",
    "    \"\"\"Return every digit run found in ``texto``, concatenated in order.\n",
    "\n",
    "    Args:\n",
    "        texto: Text to scan, for example a pagination href that contains a\n",
    "            page number. ``None`` or an empty string is accepted so that the\n",
    "            result of a selector lookup that matched nothing can be passed in.\n",
    "\n",
    "    Returns:\n",
    "        The digit runs joined into one string (``\"a1b22\"`` -> ``\"122\"``), or\n",
    "        ``\"\"`` when ``texto`` is ``None``, empty, or contains no digits.\n",
    "    \"\"\"\n",
    "    if not texto:\n",
    "        # re.findall raises TypeError on None, so treat a missing value as\n",
    "        # \"no number\" instead of crashing the caller.\n",
    "        return \"\"\n",
    "    return \"\".join(re.findall(r\"\\d+\", texto))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "hbTDxPlNs2uk"
   },
   "outputs": [],
   "source": [
    "# Instruction templates: one is picked at random for each film and the film\n",
    "# title is appended after it, so every template must end right before the title.\n",
    "questions = [\n",
    "    \"Write a description of the film\",\n",
    "    \"Write a description about the film\",\n",
    "    \"I would like you to summarise the film\",\n",
    "    \"Can you write a summary of the film\",\n",
    "    \"Summarise the film\",\n",
    "    # ...\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "NF_mdubFsohr"
   },
   "outputs": [],
   "source": [
    "home_url = \"https://letterboxd.com\"\n",
    "popular_url = \"https://letterboxd.com/films/ajax/popular/?esiAllowFilters=true\"\n",
    "# The process can take hours or days, so cap the number of rows to collect.\n",
    "# Set MAX_FILMS to None to crawl every page.\n",
    "MAX_FILMS = 5\n",
    "REQUEST_TIMEOUT = 30  # seconds, so a stalled connection cannot hang the crawl\n",
    "\n",
    "wikipedia.set_lang(\"en\")\n",
    "\n",
    "\n",
    "def get_film_description(film_title):\n",
    "    \"\"\"Return a Wikipedia summary for ``film_title``, or None if none is found.\n",
    "\n",
    "    Looks up \"<film_title> film\" first. If that raises a disambiguation error,\n",
    "    falls back to the first search result whose title contains ``film_title``\n",
    "    and whose page can be summarised.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # You can specify 'sentences: int' keyword to get less film description text\n",
    "        return wikipedia.summary(f\"{film_title} film\", auto_suggest=True)\n",
    "    except wikipedia.DisambiguationError as e:\n",
    "        print(e)\n",
    "    except wikipedia.PageError:\n",
    "        return None\n",
    "\n",
    "    # With suggestion=False, wikipedia.search returns a plain list of page\n",
    "    # titles, so iterate over the list itself (not over its first element).\n",
    "    for candidate in wikipedia.search(film_title, 10, False):\n",
    "        if film_title not in candidate:\n",
    "            continue\n",
    "        try:\n",
    "            return wikipedia.summary(candidate, auto_suggest=False)\n",
    "        except (wikipedia.DisambiguationError, wikipedia.PageError):\n",
    "            continue  # this candidate is unusable; try the next one\n",
    "    return None\n",
    "\n",
    "\n",
    "# Rows are collected in a list and turned into a DataFrame once at the end:\n",
    "# DataFrame.append was removed in pandas 2.0.\n",
    "records = []\n",
    "page_url = popular_url\n",
    "while page_url:\n",
    "    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)\n",
    "    response.raise_for_status()\n",
    "    listing = Selector(text=response.text)\n",
    "    # Get film urls (72 per page)\n",
    "    films_urls = listing.css(\".listitem.poster-container div::attr(data-target-link)\").getall()\n",
    "    for url in films_urls:\n",
    "        if MAX_FILMS is not None and len(records) >= MAX_FILMS:\n",
    "            break\n",
    "\n",
    "        film_page = requests.get(urljoin(home_url, url), timeout=REQUEST_TIMEOUT)\n",
    "        film_title = Selector(text=film_page.text).css(\"h1.headline-1.js-widont.prettify ::text\").get()\n",
    "        if not film_title:\n",
    "            continue  # the title selector matched nothing; there is nothing to look up\n",
    "\n",
    "        film_description = get_film_description(film_title)\n",
    "        if film_description:\n",
    "            records.append(\n",
    "                {\n",
    "                    \"INSTRUCTION\": f\"{choice(questions)} {film_title}\",\n",
    "                    \"RESPONSE\": film_description,\n",
    "                    \"SOURCE\": \"Wikipedia & Letterbox\",\n",
    "                }\n",
    "            )\n",
    "            # One short progress line per collected film instead of printing the whole table.\n",
    "            print(f\"{len(records)}: {film_title}\")\n",
    "\n",
    "    if MAX_FILMS is not None and len(records) >= MAX_FILMS:\n",
    "        break\n",
    "\n",
    "    # .get() returns None when there is no \"next\" link; stop cleanly in that case.\n",
    "    next_href = listing.css(\".paginate-nextprev a.next::attr(href)\").get()\n",
    "    next_p_number = number_filtering(next_href) if next_href else \"\"\n",
    "    if next_p_number:\n",
    "        page_url = f\"https://letterboxd.com/films/ajax/popular/page/{next_p_number}?esiAllowFilters=true\"\n",
    "    else:\n",
    "        page_url = None\n",
    "\n",
    "df = pd.DataFrame(records, columns=[\"INSTRUCTION\", \"RESPONSE\", \"SOURCE\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the collected rows; a bare last expression is rendered as the cell output.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "M-oczi97ssHO"
   },
   "outputs": [],
   "source": [
    "# Optional export: uncomment one of the lines below to write the dataset to disk.\n",
    "# The parquet line selects engine=\"pyarrow\", so pyarrow must be installed for it.\n",
    "# df.to_json(\"films.jsonl\", orient='records', lines=True)\n",
    "# df.to_parquet(\"films.parquet\", row_group_size=100, engine=\"pyarrow\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  },
  "vscode": {
   "interpreter": {
    "hash": "45797abf7cde6e9fd30a1bbf2712169c227f9ea75bcc0b0a4c5a4ecdb26b0eba"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
